*-----------------------------------------------------------------
version 11
cap clear
cap clear matrix
cap log close
set more off
set mem 1500m
cd "E:\REStat_MS14767_Vol96(2)\Data preparation Compustat segment"
log using 10_patentvars.log, replace
*-----------------------------------------------------------------

*****************************************************************
*** compustat_master_all_countries.dta to				    *
*** clean_compustat_master.dta 					    *
*** This file creates the variable S by taking patent infos and *
*** r&d-expenditure together.						    *
*** 2,500,000 entries remain in the patent data.		    *
*****************************************************************

**********************
*In this part, patent data and compustat data are brought together.
***********************

use "raw data\apat63_99.dta", clear
so assignee
drop if gyear<1969

joinby assignee using "patent files\cusipnamemerged_red.dta",unmatched(both) 

tab _merge 

**********************************************************
*
*                       _merge |      Freq.     Percent        Cum.
*------------------------------+-----------------------------------
*          only in master data |  1,960,613       75.37       75.37
*both in master and using data |    640,872       24.63      100.00
*------------------------------+-----------------------------------
*                        Total |  2,601,485      100.00
*************************************************************

keep if _merge==3
drop _merge

*************************
* Check observations
count if assignee!=assignee[_n-1]
* must give number around 3590
***************************

*********************************
* Here some vars are generated
* If some further patent informationshall be kept, 
* the egen-line can easily be copied for each variable.
***************************

rename gyear year
so ticker year
drop if ticker==""
gen t=1
label var t "counting variable"
egen numpatentsy=sum(t), by(ticker year) 
label var numpatentsy "number of patents in a certain year"
egen claimsy=sum(claims), by(ticker year) 
label var claimsy "sum of claims in a certain year"
gen avclaims=claimsy/numpatentsy
label var avclaims "claimsy divided by numpatentsy"
drop t

/*

********************************************************
* we generate the patent stock, lagged vars
********************************************************

by ticker: gen yearweight=year[_n+1]-year if year!=year[_n+1]
gen xxx=numpatentsy
replace xxx=0 if xxx==.
so ticker year
by ticker: gen patents=xxx+xxx[_n-1]*(1-0.15)^yearweight if year!=year[_n-1]
label var patents "patentstock, depreciated"
drop xxx
replace patents=patents[_n-1] if year==year & ticker==ticker[_n-1] & patents==.
replace patents=0 if patents==.
so ticker year
by ticker: gen patents_t_1=patents[_n-1]
label var patents_t_1 "lagged patentstock, lag 1"
by ticker: gen patents_t_2=patents[_n-2]
label var patents_t_1 "lagged patentstock, lag 2"
by ticker: gen patents_t_3=patents[_n-3]
label var patents_t_1 "lagged patentstock, lag 3"
by ticker: gen patents_t_4=patents[_n-4]
label var patents_t_1 "lagged patentstock, lag 4"

*/

*********************************************************
* creating a panel
*********************************************************

save "patent files\apat69_99_cusip.dta", replace
drop if year>1998
save "patent files\apat69_98_cusip.dta", replace
drop if year>1997
save "patent files\apat69_97_cusip.dta", replace
drop if year>1996
save "patent files\apat69_96_cusip.dta", replace
drop if year>1995
save "patent files\apat69_95_cusip.dta", replace
drop if year>1994
save "patent files\apat69_94_cusip.dta", replace
drop if year>1993
save "patent files\apat69_93_cusip.dta", replace
drop if year>1992
save "patent files\apat69_92_cusip.dta", replace
drop if year>1991
save "patent files\apat69_91_cusip.dta", replace
drop if year>1990
save "patent files\apat69_90_cusip.dta", replace
drop if year>1989
save "patent files\apat69_89_cusip.dta", replace
drop if year>1988
save "patent files\apat69_88_cusip.dta", replace
drop if year>1987
save "patent files\apat69_87_cusip.dta", replace
drop if year>1986
save "patent files\apat69_86_cusip.dta", replace

forvalues z=86/99 {
use "patent files\apat69_`z'_cusip.dta", clear
egen t=group(cusip)
egen T=max(t)
replace T=T+1
drop t

order cusip
so cusip

gen k=0
replace k=1  if subcat==11       
replace k=2  if subcat==12       
replace k=3  if subcat==13       
replace k=4  if subcat==14       
replace k=5  if subcat==15       
replace k=6  if subcat==19       
replace k=7  if subcat==21       
replace k=8  if subcat==22       
replace k=9  if subcat==23       
replace k=10 if subcat==24       
replace k=11 if subcat==31       
replace k=12 if subcat==32       
replace k=13 if subcat==33       
replace k=14 if subcat==39       
replace k=15 if subcat==41       
replace k=16 if subcat==42       
replace k=17 if subcat==43       
replace k=18 if subcat==44       
replace k=19 if subcat==45       
replace k=20 if subcat==46       
replace k=21 if subcat==49       
replace k=22 if subcat==51       
replace k=23 if subcat==52       
replace k=24 if subcat==53       
replace k=25 if subcat==54       
replace k=26 if subcat==55       
replace k=27 if subcat==59       
replace k=28 if subcat==61       
replace k=29 if subcat==62       
replace k=30 if subcat==63       
replace k=31 if subcat==64       
replace k=32 if subcat==65       
replace k=33 if subcat==66       
replace k=34 if subcat==67       
replace k=35 if subcat==68       
replace k=36 if subcat==69 

gen t=1
egen ct=sum(t), by(cusip k) /*sum of patents in that catagory*/
drop t year appyear nclass cat subcat patent
so cusip k
drop if cusip==cusip[_n+1] & k==k[_n+1]

gen kk1 =ct
gen kk2 =ct
gen kk3 =ct
gen kk4 =ct
gen kk5 =ct
gen kk6 =ct
gen kk7 =ct
gen kk8 =ct
gen kk9 =ct
gen kk10=ct
gen kk11=ct
gen kk12=ct
gen kk13=ct
gen kk14=ct
gen kk15=ct
gen kk16=ct
gen kk17=ct
gen kk18=ct
gen kk19=ct
gen kk20=ct
gen kk21=ct
gen kk22=ct
gen kk23=ct
gen kk24=ct
gen kk25=ct
gen kk26=ct
gen kk27=ct
gen kk28=ct
gen kk29=ct
gen kk30=ct
gen kk31=ct
gen kk32=ct
gen kk33=ct
gen kk34=ct
gen kk35=ct
gen kk36=ct

drop if cusip=="."

egen id=group(cusip k)

************************
* Dataset is reshaped here
************************

reshape long kk, i(id) j(j)
drop id

replace kk=0 if j~=k

so cusip j kk
drop if cusip==cusip[_n+1] & j==j[_n+1]                       
drop k ct

****************************
* Dataset is reshaped again
*************************

egen id=group(cusip j)
reshape wide kk, i(id) j(j)
drop id

forvalues i=1/36 {
egen k`i'=max(kk`i'), by(cusip)
drop kk`i'
}

count if cusip==cusip[_n-1]
drop if cusip==cusip[_n-1]

************************************
* Here starts the variable building
********************************

set matsize 5000

******************************
* The patent distribution information for all firms is stored in one matrix M, where each
* row of the matrix gives the patent distribution of one firm. The dimensions of the matrix
* will N*36, where N is the number of firms in the dataset
*****************************

mkmat k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11 k12 k13 k14 k15 k16 k17 k18 k19 k20 k21 k22 k23 k24 k25 k26 k27 k28 k29 k30 k31 k32 k33 k34 k35 k36, matrix(M)

******************************
* The matrix MS is calculated is the product of M with its transpose. The resulting matrix
* will be a square matrix of dimension N*N. Each element of the matrix will be euqal to 
* the inner product of a pair of firms' patent-distribution-vectors, where the diagonal
* elements are the inner product of one firm's patent distribution with itself, and they
* are thus sums of squares. The latter elements are stored in the row vector d.
*****************************

matrix MS=M*M'
matrix d=vecdiag(MS)

*****************************
* The outer product of the vector of sums of squares (d) is stored as the N*N matrix D.
* Each element of the matrix will thus be equal to the product of two sums of squares.
*******************************

matrix D=d'*d

egen id=group(cusip)

****************************
* The colums of the matrices MS and D are stored as variables, so that functions can
* be applied to the single elements of the matrices.
************************

svmat D
svmat MS

**************************
* The following loop first takes the square root of each observation in each D-variable
* (i.e. of each element in matrix D). Now all elemnts of MS and D that have the same
* position (row i, column j) constitute, respectively, the numerator and the denominator
* for an uncentered correlation coeffecient between the patent distributions of firms
* i and j. Thus, a fraction of each of these pairs is calculated and stored in MS
* which now constitutes a correlation matrix for ALL firms, only that the diagonal 
* elements (which are 1) are set to zero, as, later on, a firms own r&d expenditures
* are not supposed to enter its potential spillover pool.
************************************

local i=1
while `i'<T {
replace D`i'=sqrt(D`i')
replace MS`i'=MS`i'/D`i'
drop D`i'
replace MS`i'=0 if id==`i'
local i=`i'+1
}


************************************
* In the following loop, for each row of the technological correlation matrix MS
* (which refers to one firm), all elements are set to zero which measure the technology
* correletion between the firm and another firm that does NOT belong to the same
* SIC4 industry. This procedure could be applied to any groups of firms, i.e. 
* firms belonging to the same SIC2/SIC3 industry, or the group of insiders and 
* outsiders of one rjv as we defined them.
**************************************

*Changes for the segment data file:
*Since each firm can be in several Industries the following code had to be changed.
  
* First the SIC-information is added. The amount of Industries and the INdustries itself can change 
* over time fpr each firm.
* Thus the year specific SIC-Information has to be added. 



merge 1:m ticker using "compustat+segment.dta", keepusing(year SIC) keep(match)
keep if year == 1986 

drop year _merge



sort cusip SIC

egen s = seq(), by(id)
egen S = max(s), by(id)

egen id_sic=group(SIC)
local i=1
while `i'<T {
	gen ns = S if id ==`i'
	egen NS = max(ns)
	replace NS = NS+1	
    qui: sum NS
	local check = r(N)
	if `check' !=0{	
		gen match = 0
		local s = 1
		while `s'<NS{	
			gen n=id_sic if id==`i' & s == `s'
			egen nn=max(n)
			replace match = 1 if id_sic==nn
			drop n nn 
			local s=`s'+1
		}
		egen m = max(match), by(id)
		replace MS`i'=0 if m ==0
		drop NS ns	match m
		local i=`i'+1	
	}
	
	else{
	drop NS ns MS`i'
	local i=`i'+1
	}
}
keep if s == 1
	
drop s S SIC id_sic
save "patent files\proximity86_`z'.dta", replace
}

**************************
* The information on r&d-expenditures from the compustat data is isolated.
****************************

use "compustat+segment.dta", clear
keep SIC ticker name  year rd_new rd_corr cusip

so cusip year
drop if rd_corr==.

save "patent files\segment_master_rd.dta", replace

*******************************
* The r&d information is seperated into year specific datasets. Observations
* where are r&d-expenditures are not observed are DROPPED. I.e., whenever the r&d-expenditure
* variable takes on the value zero, this means that zero expenditures are actually
* OBSERVED.
*******************************

forvalues z=86/99 {
use "patent files\segment_master_rd.dta", clear
keep if year==19`z'  
save "patent files\segment_master_rd`z'.dta",replace
}

**********************************
* Now, for each year, the dataset containing the correlation matrices are merged
* with the r&d-expenditure information. What makes the issue tricky is that there
* potentially are firms observed in the correlation-matrix data set stemming from 
* the patent data that are not observed in the r&d-expenditure dataset because the 
* variable was not observed in compustat for these firms. In such cases, not only
* the corresponding row for the firm must be dropped, but also the corresponding
* column. Meanwhile, new id's must be assigned in order not to mess up the order
* of rows and columns.
************************************



set more off
forvalues z=86/99 {
use "patent files\proximity86_`z'.dta", clear

merge 1:m cusip  using "patent files\segment_master_rd`z'.dta"
drop if _merge==2
egen d = group(cusip) if _merge==3
egen id_new= max(d), by(cusip)
drop d
egen u=group(cusip) if _merge==1 & id_new==.
egen tt=max(u)
replace tt=tt+1
local f=1
while `f'<tt {
	gen dro=id if u==`f'
	egen dropp=max(dro)
	local dr=dropp
	drop MS`dr'
	drop dro dropp
	local f=`f'+1
}
local f=1
drop tt u
egen tt=max(id_new)
replace tt=tt+1
while `f'<tt {
	gen p=id_new if id_new==`f'
	egen s=max(p)
	drop p
	local p=s
	drop s
	gen q=id if id_new==`f'
	egen r=max(q)
	drop q
	local q=r
	drop r
	rename MS`q' MS`p'a
	rename MS`p'a MS`p'

	local f=`f'+1
}
drop if _merge==1
drop _merge

* Here the Spillover is calculated with the  Sic-sqecific r&d-Expenditure. From that SIC  where both firm meet
* each other. 
egen id_sic=group(SIC)
egen s = seq(), by(id)
egen SS = max(s), by(id)	
gen S=0
	local i=1
	while `i'<tt{
	gen ns = SS if id_new==`i'
	egen NS = max(ns)
	replace NS = NS+1	
    	gen match = 0
		local s = 1
		while `s'<NS{	
			gen n=id_sic if id_new==`i' & s == `s'
			egen nn=max(n)
			replace match = 1 if id_sic==nn
			drop n nn 
			local s=`s'+1
		}
		gen MM = MS`i'
		replace MM = 0 if match==0
		replace MM = MM*rd_corr
		egen M = total(MM)
		replace S = M if id_new ==`i'		
		drop NS ns	match MM M
		local i=`i'+1		
}

* Here the spillover is calculeted using the aggregated R&D expenditure over all SIC the firm is in, in that year.
	egen rd_tot = total(rd_corr), by(id_new)
	keep if s == 1
	drop s SS rd_corr
	local m = tt-1
	mkmat MS1-MS`m', matrix(MM)
	mkmat rd_tot, matrix(rd)
	matrix Sa=MM*rd
	svmat Sa
	rename Sa1 Sa
	drop tt 
	save "patent files\spillover_pool_`z'.dta", replace
}



forvalues z=86/99 {
use "patent files\spillover_pool_`z'.dta", clear
keep SIC S Sa year T cusip ticker numpatentsy claimsy avclaims
save "patent files\spillover_pool_`z'_red.dta", replace
}

use "patent files\spillover_pool_86_red.dta", clear
forvalues z=87/99 {
append using "patent files\spillover_pool_`z'_red.dta"
}

*gen S=.
*forvalues z=86/87 {
*replace S=S`z' if S`z'~=.
*drop S`z'
*}

label var S  "spillover variable with sic specific rd, patents"
label var Sa "spillover variable with aggregated rd, patents"


so ticker year
save "patent files\spillover_pool_panel.dta", replace

keep ticker year SIC cusip S Sa numpatentsy claimsy avclaims 
so cusip year 
order cusip ticker year

desc

save "patentvars.dta", replace



********************************
* This resulting data set contains various data on patents and the new variable S.
* It may now be merged with the whole compustat database.
*******************************

log close













